#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Created: May 27, 2019
# Modified: Dec 11, 2020
# Author: Gloria

# DESCRIPTIVES OF TFIDF & CONTEXTS

# PACKAGES #############################################################

from nltk import ngrams
import string
import re
import pandas as pd
import os

from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import stem_text
from gensim.utils import simple_preprocess
from nltk import sent_tokenize
from collections import Counter
from nltk.probability import FreqDist

# Translation table for str.translate() that deletes every character listed
# in string.punctuation.
translator = str.maketrans(dict.fromkeys(string.punctuation))


# SELECT COMPUTER ###############################################

# User account name interpolated into the '/Users/<computer>/Dropbox/...'
# directories that the script changes into. Uncomment the one to use.
# computer = "gloria"
computer = "glgennaro"      # work machine


# DICTIONARY ################################################################
# "pop" dictionary (presumably the populism dictionary -- TODO confirm).
# The entries are matched verbatim against the stemmed tokens of the 5-grams
# built further down. 'promissori' is intentionally left out.

pop_us = [
    'absurd', 'absurdli',
    'admit', 'admitt',
    'arrog', 'arrogantli',
    'betrai',
    'cast',
    'class',
    'corrupt',
    'deceit',
    'direct', 'directli', 'directori',
    'elit', 'elitist',
    'establish',
    'peopl',
    'polit', 'politic', 'politician',
    'promin', 'promis',
    'propaganda',
    'referendum',
    'regim', 'regimen',
    'rule',
    'shame',
    'tradit', 'tradition',
    'treason',
    'undemocrat',
]


# PRESIDENTIAL 2016 ###############################################

# Load both candidates' speeches. The working directory is switched because
# the CSV files are opened by bare file name.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/topics')
trump = pd.read_csv('all_punctuated_trump.csv', sep=',', encoding='utf-8')
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts.
trump = trump.drop(columns='Unnamed: 0')
trump['candidate'] = 'trump'

clinton = pd.read_csv('all_punctuated_clinton.csv', sep=',', encoding='utf-8')
clinton = clinton.drop(columns=['Unnamed: 0', 'presidency'])
clinton['candidate'] = 'clinton'

df = pd.concat([trump, clinton])

text = list(df['text'])

# --- Clean the raw transcripts -------------------------------------------
text = [t.strip("/") for t in text]
# Collapse runs of spaces, then make sure every full stop is followed by a
# space (the raw text has many missing spaces after full stops).
text = [re.sub(r'\.(?! )', '. ', re.sub(r' +', ' ', t)) for t in text]
text = [t.replace(']', '] ') for t in text]
text = [t.replace(':', ': ') for t in text]
# Remove bracketed spans such as "[...]". Raw string: the former non-raw
# pattern relied on invalid escape sequences ("\[").
text = [re.sub(r"[\[].*?[\]]", "", t) for t in text]
# Strings replaced by a single space; add any further unwanted strings here.
# NOTE: entries are applied in order, so '--' never matches once '-' has
# already been replaced.
daeliminare = ['â€”','â€“','Â','“','-','--',]
for unwanted in daeliminare:
    text = [t.replace(unwanted, ' ') for t in text]
# Keep only what precedes each marker: this drops marginal comments, i.e. the
# publisher's closing notes.
for sep in ('APP Note:', 'NOTE:', 'Citation:'):
    text = [t.split(sep, 1)[0] for t in text]
del sep

# --- Sentences -> stemmed token lists ------------------------------------
sent = [sent_tokenize(a) for a in text]
sent = [item for sublist in sent for item in sublist]
# NOTE(review): stopwords are removed before lower-casing; if
# remove_stopwords is case-sensitive, capitalised stopwords survive -- confirm
# whether that is intended.
sent = [remove_stopwords(a.translate(translator)).lower() for a in sent]
# Only sentences with at least 10 remaining words are analysed.
sent = [a for a in sent if len(a.split()) >= 10]
sent = [stem_text(a) for a in sent]
sent = [simple_preprocess(a) for a in sent]


# Create all 5-grams (tuples of 5 consecutive tokens within one sentence)
grams = [gram for tokens in sent for gram in ngrams(tokens, 5)]


# For every dictionary stem, select
#   * contexts: its three most frequent 5-grams, and
#   * parole:   the most frequent words found in the 5-grams around it.
contexts = []
parole = []
for stem in pop_us:
    # Every 5-gram that has the stem as one of its tokens (exact token match).
    # Computed once and shared by both summaries.
    matching = [gram for gram in grams if stem in gram]

    if len(matching) >= 3:
        top = [gram for gram, _ in Counter(matching).most_common(3)]
        # `matching` counts occurrences, not distinct 5-grams: three or more
        # copies of the same one or two grams used to raise IndexError when
        # indexing the 2nd/3rd entry. Pad the row with None instead.
        top += [None] * (3 - len(top))
        contexts.append([stem, *top])

    # Of the 6 most common tokens in those 5-grams, keep the ones that do not
    # contain the stem as a substring (this removes the stem itself).
    fdist = FreqDist([token for gram in matching for token in gram])
    neighbours = [token for token, _ in fdist.most_common(6) if stem not in token]
    # One row per stem: the stem followed by the list of neighbouring words
    # (the list is stored as a single cell).
    parole.append([stem, neighbours])

contexts = pd.DataFrame(contexts)
parole = pd.DataFrame(parole)

# Write both tables to the output folder.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/output_text_analysis')
contexts.to_csv('contexts_pres.csv')
parole.to_csv('frequent_context_words_pres.csv')


# CONGRESS 2018 ###############################################

# Read the legislative texts; rows with any missing value are discarded.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/dta/legislative')
df = pd.read_csv('legislative2018_mod.CSV', sep=',', encoding='windows-1252').dropna()
text = list(df['text'])

# Split every document into sentences, strip punctuation and stopwords,
# lower-case, keep sentences with at least 10 words, then stem and tokenise.
sent = []
for document in text:
    for sentence in sent_tokenize(document):
        cleaned = remove_stopwords(sentence.translate(translator)).lower()
        if len(cleaned.split()) >= 10:
            sent.append(simple_preprocess(stem_text(cleaned)))


# Create all 5-grams
grams = []
for tokens in sent:
    grams.extend(ngrams(tokens, 5))

# For every dictionary stem: its single most frequent 5-gram (kept together
# with its count) and the most frequent words in the 5-grams around it.
contexts = []
parole = []
for term in pop_us:
    # 5-grams that contain the stem as one of their tokens.
    hits = [gram for gram in grams if term in gram]

    if len(hits) >= 3:
        # (5-gram, count) pair of the most frequent context.
        most_frequent = Counter(hits).most_common(1)[0]
        contexts.append([term, most_frequent])

    # Of the 6 most common tokens, drop those containing the stem.
    word_counts = FreqDist([word for gram in hits for word in gram])
    frequent = [word for word, _ in word_counts.most_common(6) if term not in word]
    parole.append([term, frequent])


contexts = pd.DataFrame(contexts)
parole = pd.DataFrame(parole)

# Save both tables in the output folder.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/output_text_analysis')
contexts.to_csv('contexts_cong.csv')
parole.to_csv('frequent_context_words_cong.csv')


# CONGRESS 2020 ###############################################

# Build the directory from `computer` like every other section does; the user
# name used to be hard-coded here, so switching machines broke this section.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/Raw data/Congress 2020')

df = pd.read_excel('dataset_house.xlsx', sheet_name='txt')

# Drop columns with topic titles
df = df.loc[:, ~df.columns.str.startswith('Title_')]

# Put text together: one string per row, joining all 'topic_*' columns with
# spaces. Missing cells become the string 'nan' after astype(str) and are
# blanked before joining.
selection = [a for a in list(df) if a.startswith('topic_')]
selection = df[selection].astype(str)
selection[selection == 'nan'] = ''
selection = selection.agg(' '.join, axis=1)

text = list(selection)

# Sentences -> punctuation/stopword-free, lower-cased, stemmed token lists.
sent = [sent_tokenize(a) for a in text]
sent = [item for sublist in sent for item in sublist]
sent = [remove_stopwords(a.translate(translator)).lower() for a in sent]
# Only sentences with at least 10 remaining words are analysed.
sent = [a for a in sent if len(a.split()) >= 10]
sent = [stem_text(a) for a in sent]
sent = [simple_preprocess(a) for a in sent]


# Create all 5-grams
grams = [gram for tokens in sent for gram in ngrams(tokens, 5)]

# Most frequent 5-gram (with its count) for each dictionary stem that occurs
# in at least three 5-grams.
contexts = []
for entry in pop_us:
    around = [gram for gram in grams if entry in gram]
    if len(around) < 3:
        continue
    contexts.append([entry, Counter(around).most_common(1)[0]])

# Most frequent words in the 5-grams around each stem: of the 6 most common
# tokens, keep those that do not contain the stem.
parole = []
for entry in pop_us:
    tokens = [token for gram in grams if entry in gram for token in gram]
    ranked = FreqDist(tokens).most_common(6)
    parole.append([entry, [token for token, _ in ranked if entry not in token]])


contexts = pd.DataFrame(contexts)
parole = pd.DataFrame(parole)

# Save both tables in the output folder.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/output_text_analysis')
contexts.to_csv('contexts_cong2020.csv')
parole.to_csv('frequent_context_words_cong2020.csv')




# ALL CORPORA COMBINED ###############################################
# (presidential 2016 + congress 2018 + congress 2020; outputs are written
# with the '_tutto' suffix)

# Presidential 2016 speeches.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/topics')
trump = pd.read_csv('all_punctuated_trump.csv', sep=',', encoding='utf-8')
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts.
trump = trump.drop(columns='Unnamed: 0')
trump['candidate'] = 'trump'

clinton = pd.read_csv('all_punctuated_clinton.csv', sep=',', encoding='utf-8')
clinton = clinton.drop(columns=['Unnamed: 0', 'presidency'])
clinton['candidate'] = 'clinton'

df = pd.concat([trump, clinton])
text = list(df['text'])

# Congress 2018 texts; rows with any missing value are discarded.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/dta/legislative')
df = pd.read_csv('legislative2018_mod.CSV', sep=',', encoding='windows-1252')
df = df.dropna()
text1 = list(df['text'])


# Congress 2020 texts. The directory is built from `computer` like the other
# paths; the user name used to be hard-coded here.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/Raw data/Congress 2020')
df = pd.read_excel('dataset_house.xlsx', sheet_name='txt')
df = df.loc[:, ~df.columns.str.startswith('Title_')]
# One string per row, joining all 'topic_*' columns; missing cells become
# the string 'nan' after astype(str) and are blanked before joining.
selection = [a for a in list(df) if a.startswith('topic_')]
selection = df[selection].astype(str)
selection[selection == 'nan'] = ''
selection = selection.agg(' '.join, axis=1)
text2 = list(selection)



text = text + text1 + text2

# --- Clean the raw texts (applied to all three corpora) -------------------
text = [t.strip("/") for t in text]
# Collapse runs of spaces, then make sure every full stop is followed by a
# space (the raw text has many missing spaces after full stops).
text = [re.sub(r'\.(?! )', '. ', re.sub(r' +', ' ', t)) for t in text]
text = [t.replace(']', '] ') for t in text]
text = [t.replace(':', ': ') for t in text]
# Remove bracketed spans such as "[...]". Raw string: the former non-raw
# pattern relied on invalid escape sequences ("\[").
text = [re.sub(r"[\[].*?[\]]", "", t) for t in text]
# Strings replaced by a single space; add any further unwanted strings here.
# NOTE: entries are applied in order, so '--' never matches once '-' has
# already been replaced.
daeliminare = ['â€”','â€“','Â','“','-','--',]
for unwanted in daeliminare:
    text = [t.replace(unwanted, ' ') for t in text]
# Keep only what precedes each marker: this drops marginal comments, i.e. the
# publisher's closing notes.
for sep in ('APP Note:', 'NOTE:', 'Citation:'):
    text = [t.split(sep, 1)[0] for t in text]
del sep

# --- Sentences -> stemmed token lists ------------------------------------
sent = [sent_tokenize(a) for a in text]
sent = [item for sublist in sent for item in sublist]
sent = [remove_stopwords(a.translate(translator)).lower() for a in sent]
# Only sentences with at least 10 remaining words are analysed.
sent = [a for a in sent if len(a.split()) >= 10]
sent = [stem_text(a) for a in sent]
sent = [simple_preprocess(a) for a in sent]

# All 5-grams (tuples of 5 consecutive tokens within one sentence).
grams = [gram for tokens in sent for gram in ngrams(tokens, 5)]

# For every dictionary stem, select
#   * contexts: its three most frequent 5-grams, and
#   * parole:   the most frequent words found in the 5-grams around it.
contexts = []
parole = []
for stem in pop_us:
    # Every 5-gram that has the stem as one of its tokens (exact token match).
    # Computed once and shared by both summaries.
    matching = [gram for gram in grams if stem in gram]

    if len(matching) >= 3:
        top = [gram for gram, _ in Counter(matching).most_common(3)]
        # `matching` counts occurrences, not distinct 5-grams: three or more
        # copies of the same one or two grams used to raise IndexError when
        # indexing the 2nd/3rd entry. Pad the row with None instead.
        top += [None] * (3 - len(top))
        contexts.append([stem, *top])

    # Of the 6 most common tokens in those 5-grams, keep the ones that do not
    # contain the stem as a substring (this removes the stem itself).
    fdist = FreqDist([token for gram in matching for token in gram])
    neighbours = [token for token, _ in fdist.most_common(6) if stem not in token]
    # One row per stem: the stem followed by the list of neighbouring words
    # (the list is stored as a single cell).
    parole.append([stem, neighbours])

contexts = pd.DataFrame(contexts)
parole = pd.DataFrame(parole)

# Write both tables to the output folder.
os.chdir('/Users/' + computer + '/Dropbox/Progetti/Rhetoric/output_text_analysis')
contexts.to_csv('contexts_tutto.csv')
parole.to_csv('frequent_context_words_tutto.csv')


